Mental Health in Tech Workplace


In this project, our group gauges the prevalence of certain mental health disorders within the tech industry and examines how mental health is viewed within the tech/IT workplace. We use the dataset from the 2016 OSMI Mental Health in Tech Survey.


Prevalence of Mental Health

Overview

In this section, we analyze how mental health conditions differ across groups of people (by sex and age).
library(dplyr)
library(forcats)
library(ggplot2)
library(data.table)
library(ggpubr)
library(ggthemes)
library(tidyverse)
library(reshape2)
library(RColorBrewer)
library(readxl)
library(rgdal)
library(sp)
library(raster)
library(leaflet)
library(maps)
library(ggmap)
library(plotly)
library(quanteda)
library(wordcloud)
library(wordcloud2)
library(readr)
library(dplyr)
library(tm)
library(tidytext)
library(patchwork)
library(SnowballC)

# Load the 2016 OSMI survey CSV into `mental`.
mental <- read.csv(
  file = 'mental-heath-in-tech-2016_20161114.csv',
  header = TRUE,
  encoding = "UTF-8"
)

Data Distribution

## data preprocessing
# gender info: bucket the free-text answers by their first letter
# (F/W -> Female, M -> Male, everything else, including missing -> Others)
gender_raw <- mental[, grep('gender', colnames(mental))]
gender_initial <- toupper(str_sub(gender_raw, 1, 1))
gender <- ifelse(gender_initial %in% c('F', 'W'), 'Female',
                 ifelse(gender_initial %in% 'M', 'Male', 'Others'))
# age: ten-year bands [10,20), [20,30), ..., [60,70]; ages outside become NA
age <- mental[, grep('What.is.your.age.', colnames(mental))]
breaks <- seq(10, 70, by = 10)
age_category <- cut(as.numeric(age), breaks, include.lowest = TRUE,
                    right = FALSE, dig.lab = 10)
# condition
condition <- mental[grep('condition', colnames(mental))]
#colnames(condition)
condition.diagnosed <- condition$Have.you.been.diagnosed.with.a.mental.health.condition.by.a.medical.professional.
condition.type <- condition$If.so..what.condition.s..were.you.diagnosed.with.
# one column per diagnosis; answers are "|"-separated and short rows are
# padded with empty strings
temp <- str_split(condition.type, '[|]', simplify = TRUE)

# data for plot
# Build the data frame column by column: cbind() on mixed vectors creates a
# character matrix, which would turn age into text.
plotdata <- data.frame(
  gender = gender,
  age = as.numeric(age),
  age_category = as.character(age_category),
  condition.diagnosed = condition.diagnosed,
  temp,
  stringsAsFactors = FALSE
)
plotdata_long <- melt(plotdata, id.vars = c('gender', 'age', 'age_category', 'condition.diagnosed'))
#colnames(plotdata_long)

## age distribution
# Drop the implausible age 323. A logical mask (rather than -which(...)) keeps
# every row when no such record exists; rows with a missing age are kept.
plotdata <- plotdata[is.na(plotdata$age) | plotdata$age != 323, ]
box_gender <- ggplot(plotdata) +
  geom_boxplot(aes(x = gender, y = age, fill = gender, group = gender)) +
  # age is numeric, so the axis is continuous
  scale_y_continuous(breaks = seq(19, 99, 10)) +
  scale_fill_manual(values = c("#E41A1C", "#449B75", "#AC5782")) +
  #scale_fill_manual(values=col) +
  theme_bw() +
  ggtitle("Age Distribution by Sex") +
  xlab("Gender") +
  ylab("Age") +
  theme(plot.title = element_text(hjust = 0.5))
box_gender

The ages of women mainly range from 28 to 40; similarly, the ages of men mainly range from 29 to 39. Both groups have a median age of 33.


## condition.diagnosed
library(ggpubr)
# Diagnosis answers counted per gender (dodged bars).
diagnosed_by_gender <- ggplot(plotdata) +
  geom_bar(aes(x = gender, fill = condition.diagnosed), position = 'dodge') +
  scale_fill_manual(values = c("#449B75", "#E41A1C")) +
  theme_bw() +
  labs(
    x = "Gender",
    y = "Number of People",
    fill = "Have you been diagnosed with a mental health \n condition by a medical professional?"
  )

# Diagnosis answers counted per age band (dodged bars).
diagnosed_by_age <- ggplot(plotdata) +
  geom_bar(aes(x = age_category, fill = condition.diagnosed), position = 'dodge') +
  scale_fill_manual(values = c("#449B75", "#E41A1C")) +
  #scale_fill_manual(palette="Set1") +
  theme_bw() +
  labs(x = "Age", y = "Number of People") +
  theme(axis.text.x = element_text(angle = 0))

# Both panels side by side with a single shared legend underneath.
figure <- ggpubr::ggarrange(
  diagnosed_by_gender,
  diagnosed_by_age,
  common.legend = TRUE,
  legend = "bottom"
)

annotate_figure(
  figure,
  top = text_grob("Condition Diagnosed by Sex and Age", face = "bold")
)


For women, the number of people who have been diagnosed with some type of mental health disorder is about twice the number of people who have not been diagnosed. For men, the two numbers are roughly equal. Across all age categories, the number of people who have been diagnosed with a condition is somewhat higher than the number of people who have not. In general, mental health problems are prevalent and ubiquitous within the tech industry.


Mental Disorder Type

## condition plot
# Six colours interpolated from the nine-colour Set1 palette.
col <- colorRampPalette(brewer.pal(9, 'Set1'))(6)
# Keep only the cells that actually hold a diagnosis (drops '' and NA values).
plotdata_long <- dplyr::filter(plotdata_long, value != '')

# Distinct diagnosis labels, printed for reference.
a <- unique(plotdata_long$value)
a
#  [1] "Anxiety Disorder (Generalized, Social, Phobia, etc)"                                
#  [2] "Mood Disorder (Depression, Bipolar Disorder, etc)"                                  
#  [3] "Personality Disorder (Borderline, Antisocial, Paranoid, etc)"                       
#  [4] "Attention Deficit Hyperactivity Disorder"                                           
#  [5] "Seasonal Affective Disorder"                                                        
#  [6] "Depression"                                                                         
#  [7] "Substance Use Disorder"                                                             
#  [8] "Obsessive-Compulsive Disorder"                                                      
#  [9] "Post-traumatic Stress Disorder"                                                     
# [10] "Psychotic Disorder (Schizophrenia, Schizoaffective, etc)"                           
# [11] "Eating Disorder (Anorexia, Bulimia, etc)"                                           
# [12] "Autism - while not a \"mental illness\", still greatly affects how I handle anxiety"
# [13] "Stress Response Syndromes"                                                          
# [14] "attention deficit disorder (but not the hyperactive version)"                       
# [15] "Asperger Syndrome"                                                                  
# [16] "ADD (w/o Hyperactivity)"                                                            
# [17] "Addictive Disorder"                                                                 
# [18] "Schizotypal Personality Disorder"                                                   
# [19] "PDD-NOS (see above)"                                                                
# [20] "Suicidal Ideation"                                                                  
# [21] "Attention Deficit Disorder"                                                         
# [22] "Intimate Disorder"                                                                  
# [23] "Dissociative Disorder"                                                              
# [24] "Aspergers"                                                                          
# [25] "Autism"
# Keep the five diagnoses that the listing above shows at positions 1, 2, 4, 8
# and 9 of `a`, and pool every other answer into "others". Matching on the
# labels themselves (rather than on their position in `a`) keeps the grouping
# stable if the row order of the data changes.
main_conditions <- c(
  "Anxiety Disorder (Generalized, Social, Phobia, etc)",
  "Mood Disorder (Depression, Bipolar Disorder, etc)",
  "Attention Deficit Hyperactivity Disorder",
  "Obsessive-Compulsive Disorder",
  "Post-traumatic Stress Disorder"
)
plotdata_long$value[!plotdata_long$value %in% main_conditions] <- "others"

# Stacked counts of each diagnosis per age band, one panel per gender.
ggplot(plotdata_long, aes(x = age_category, fill = value)) +
  geom_bar(position = 'stack') +
  facet_wrap(~gender) +
  scale_fill_manual(values = col) +
  ylab("Number of People") +
  ggtitle("Specific Mental Health Disorders by Sex and Age") +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.position = 'bottom',
    legend.text = element_text(size = 9),
    axis.text.x = element_text(angle = 90),
    plot.title = element_text(hjust = 0.5)
  ) +
  # two-column legend with small square keys
  guides(
    fill = guide_legend(
      ncol = 2,
      keywidth = unit(4, 'mm'),
      keyheight = unit(4, 'mm')
    )
  )

Anxiety Disorder and Mood Disorder are the two most prevalent mental health disorders across all ages for both male and female within the tech industry.


Maps

Overview

In this part, we show statistics for the different states in the US. To keep the analysis reliable, we exclude the states whose sample size is less than 5. These states are shown as NA on the map.
# Per-respondent table for the state maps:
#   state            - US state or territory of residence
#   treatment        - answer to the "ever sought treatment" question, as stored
#   employer_benefit - 1 if the employer provides mental health benefits, else 0
#   disorder         - current disorder: Yes = 1, Maybe = 0.5, No = 0, otherwise NA
df <- mental %>%
  dplyr::transmute(
    state = What.US.state.or.territory.do.you.live.in.,
    treatment = Have.you.ever.sought.treatment.for.a.mental.health.issue.from.a.mental.health.professional.,
    employer_benefit = dplyr::case_when(
      Does.your.employer.provide.mental.health.benefits.as.part.of.healthcare.coverage. == 'Yes' ~ 1,
      TRUE ~ 0
    ),
    disorder = dplyr::case_when(
      Do.you.currently.have.a.mental.health.disorder. == 'Yes' ~ 1,
      Do.you.currently.have.a.mental.health.disorder. == 'Maybe' ~ 0.5,
      Do.you.currently.have.a.mental.health.disorder. == 'No' ~ 0
    )
  )
#head(df)


df1 <- df %>%
  dplyr::select('state', 'treatment')
# Respondents per state.
df_treatment1 <- df1 %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count = n())
# Respondents per state with treatment == 1.
df_treatment2 <- df1 %>%
  dplyr::filter(treatment == 1) %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count1 = n())
# Share of respondents with treatment == 1, for states with at least 5 respondents.
df_treatment <- df_treatment1 %>%
  dplyr::left_join(df_treatment2, by = "state") %>%
  dplyr::mutate(
    # a state absent from df_treatment2 has zero such respondents, so its
    # share is 0 rather than missing
    count1 = dplyr::coalesce(count1, 0L),
    treatment_prec = count1 / count
  ) %>%
  dplyr::filter(count > 4)

#df_treatment <- df1 %>% group_by(state) %>% summarise(treatment_perc = sum('treatment'== 1))
#df1 <- group_by(df, state) %>% mutate(disorder_percent = disorder/sum(disorder))
#head(df_treatment1)
#sum(df_treatment1$count)


df2 <- df %>%
  dplyr::select('state', 'employer_benefit')
# Respondents per state.
df_employer_benefit1 <- df2 %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count = n())
# Respondents per state with employer_benefit == 1.
df_employer_benefit2 <- df2 %>%
  dplyr::filter(employer_benefit == 1) %>%
  dplyr::group_by(state) %>%
  dplyr::summarise(count1 = n())
# Share of respondents with employer_benefit == 1, for states with at least 5
# respondents.
df_employer_benefit <- df_employer_benefit1 %>%
  dplyr::left_join(df_employer_benefit2, by = "state") %>%
  dplyr::mutate(
    # a state absent from df_employer_benefit2 has zero such respondents, so
    # its share is 0 rather than missing
    count1 = dplyr::coalesce(count1, 0L),
    employer_benefit_prec = count1 / count
  ) %>%
  dplyr::filter(count > 4)


# Full outer join of the two per-state tables, keeping only the state name and
# the two frequency columns.
df_final <- dplyr::select(
  merge(x = df_employer_benefit, y = df_treatment, by = "state", all = TRUE),
  'state', 'treatment_prec', 'employer_benefit_prec'
)
#df_final$treatment_prec <- scales::percent(df_final$treatment_prec)
#df_final$employer_benefit_prec <- scales::percent(df_final$employer_benefit_prec)
#df_final$treatment_prec <- round(df_final$treatment_prec, digits = 2)
#df_final$employer_benefit_prec <- round(df_final$employer_benefit_prec, digits = 2)

#' Round every numeric column of a data frame
#'
#' @param x A data frame.
#' @param digits Number of decimal places, passed on to `round()`.
#' @return `x` with its numeric columns rounded; all other columns unchanged.
round_df <- function(x, digits) {
  # is.numeric() is FALSE for factors and dates, whereas mode() reports
  # "numeric" for them and would hand those columns to round(), which errors.
  numeric_columns <- vapply(x, is.numeric, logical(1))
  x[numeric_columns] <- round(x[numeric_columns], digits)
  x
}

# Round the per-state frequencies to three decimals.
df_final <- round_df(df_final, digits = 3)
# From https://www.census.gov/geo/maps-data/data/cbf/cbf_state.html

# State polygons read from the shapefile.
states <- shapefile("cb_2018_us_state_20m.shp")

# Attach the survey frequencies to the polygon attribute table by state name.
# A left join keeps the attribute rows in their original order, so they stay
# aligned with the polygons.
combined <- dplyr::left_join(states@data, df_final, by = c(NAME = "state"))
states@data <- combined

Frequency of Employee Who Sought Mental Health Treatment

# Choropleth of `treatment_prec` per state, in five equal-width colour bins.
bins <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
pal <- colorBin("YlOrRd", domain = states$treatment_prec, bins = bins, right = TRUE)
# Hover label: state name followed by its frequency.
labels_states <- paste(states$NAME,
                       "Employee Sought Treatment Frequency:", states$treatment_prec)

m <- leaflet() %>%
  setView(-96, 37.8, 4) %>%
  # the Mapbox token is read from the MAPBOX_ACCESS_TOKEN environment variable
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
  addPolygons(data = states,
    fillColor = ~pal(treatment_prec),
    weight = 2,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    # full argument name instead of relying on partial matching of `highlight`
    highlightOptions = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE),
    label = labels_states,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto")) %>%
  addLegend(data = states, pal = pal, title = "Frequency of Employee Who Sought Treatment", values = ~treatment_prec, opacity = 0.7,
    position = "bottomright")
m
Among all the states, Iowa, North Carolina, New Jersey and Connecticut have the highest scores (>0.8), while Kansas, Missouri and Arizona have the lowest scores (<0.2). In the states with high scores, a higher percentage of survey participants have sought mental health treatment, suggesting that the overall mental health situation there is worse.


Frequency of Employer with Mental Health Benefit

# Choropleth of `employer_benefit_prec` per state, in five equal-width colour bins.
bins <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
pal <- colorBin("YlOrRd", domain = states$employer_benefit_prec, bins = bins)
# Hover label: state name followed by its frequency.
labels_states <- paste(states$NAME,
                       "Employer with Mental Healthcare Benefit Frequency:", states$employer_benefit_prec)

n <- leaflet() %>%
  setView(-96, 37.8, 4) %>%
  # the Mapbox token is read from the MAPBOX_ACCESS_TOKEN environment variable
  addProviderTiles("MapBox", options = providerTileOptions(
    id = "mapbox.light",
    accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
  addPolygons(data = states,
    fillColor = ~pal(employer_benefit_prec),
    weight = 2,
    opacity = 1,
    color = "white",
    dashArray = "3",
    fillOpacity = 0.7,
    # full argument name instead of relying on partial matching of `highlight`
    highlightOptions = highlightOptions(
      weight = 5,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE),
    label = labels_states,
    labelOptions = labelOptions(
      style = list("font-weight" = "normal", padding = "3px 8px"),
      textsize = "15px",
      direction = "auto")) %>%
  addLegend(data = states, pal = pal, title = "Frequency of Employer with Mental Healthcare Benefit", values = ~employer_benefit_prec, opacity = 0.7,
    position = "bottomright")


n
In this plot, we show the percentage of employers who provide mental healthcare benefits. Compared to the previous plot, no state scores higher than 0.8 in this analysis, meaning that overall mental healthcare benefits are not good enough. Among all the states, Iowa is still in the highest range, suggesting that although its mental health benefits are good, its employees' mental health is still not optimal. Michigan, Pennsylvania, Tennessee and Missouri also do well in mental healthcare benefits. New Jersey scores the lowest in the US in mental healthcare benefits, while its employee mental health situation is also among the worst.


Mental Health in TECH/IT Workplace

Overview

Following our analysis of employees who sought mental health treatment and of employers that provide mental healthcare benefits, we now turn to the relationship between mental health issues and reactions in the workplace.


Reactions by Gender and Company Size

We will mainly focus on the three questions below and see how the answers differ by gender and company size.
Q1: Would you bring up a mental health issue with a potential employer in an interview?
Q2: Do you feel that being identified as a person with a mental health issue would hurt your career?
Q3: Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?
# Q1, male respondents: number of answers per company size and answer, plus
# each answer's percentage within its company size.
mental1 <- mental %>% filter(gender == 'Male') %>%
  dplyr::group_by(company_size, gender, bringup_issue) %>%
  dplyr::summarise(frequency = n()) %>%
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)

# Q1, female respondents: same table, with the counts multiplied by 2.32.
mental2 <- mental %>% filter(gender == 'Female') %>%
  dplyr::group_by(company_size, gender, bringup_issue) %>%
  dplyr::summarise(frequency = 2.32 * n()) %>% #times male/female ratio
  dplyr::mutate(frequency_ratio = frequency / sum(frequency) * 100)

mental3 <- rbind(mental1, mental2)
# Balloon plot: one panel per gender, balloon size and colour both show `frequency`.
fig <- ggballoonplot(mental3, y = "company_size", x = "bringup_issue", fill = "frequency", size = "frequency",
                     facet.by = "gender", ggtheme = theme_bw()) +
  scale_fill_viridis_c(option = "D", direction = -1)
# The x axis shows the answers to the question, so it is labelled "Answer"
# like the other two balloon plots (it previously read "Dose (mg)").
fig <- ggpar(fig, main = "Company Size & Whether You Would Mention Mental Issues During Interview", xlab = "Answer", ylab = "Company size")
fig

From the gender perspective, it is quite uncommon for female respondents to bring up mental health issues during interviews, while more male respondents would bring them up. From the company size perspective, employees at companies of all sizes tend to avoid mentioning mental health issues during interviews. According to the answer distribution, respondents at companies with 26-500 employees are less likely than others to bring up mental health issues during an interview.
# Q2, male respondents: number of answers per company size and answer, plus
# each answer's percentage within its company size.
mental4 <- dplyr::filter(mental, gender == "Male")
mental4 <- dplyr::group_by(mental4, company_size, gender, mental_hurt_career)
mental4 <- dplyr::summarise(mental4, frequency = dplyr::n())
mental4 <- dplyr::mutate(mental4, frequency_ratio = frequency / sum(frequency) * 100)

# Q2, female respondents: same table, with the counts multiplied by 2.32
# (times male/female ratio).
mental5 <- dplyr::filter(mental, gender == "Female")
mental5 <- dplyr::group_by(mental5, company_size, gender, mental_hurt_career)
mental5 <- dplyr::summarise(mental5, frequency = 2.32 * dplyr::n())
mental5 <- dplyr::mutate(mental5, frequency_ratio = frequency / sum(frequency) * 100)

# Male rows first, then female rows.
mental6 <- rbind(mental4, mental5)

# Balloon plot: one panel per gender, balloon size and colour both show `frequency`.
fig2 <- ggballoonplot(
  mental6,
  x = "mental_hurt_career",
  y = "company_size",
  size = "frequency",
  fill = "frequency",
  facet.by = "gender",
  ggtheme = theme_bw()
)
fig2 <- fig2 + scale_fill_viridis_c(option = "D", direction = -1)
fig2 <- ggpar(
  fig2,
  main = "Company Size & Whether Mental Health Issue Would Hurt Your Career",
  xlab = "Answer",
  ylab = "Company size"
)
fig2

From the gender perspective, female respondents tend to agree that mental health issues would hurt their careers, while some male respondents are more likely to think that mental health issues would not harm their careers. From the company size perspective, employees at companies of all sizes tend to connect mental health issues with harm to their careers. In particular, respondents from companies with 26-500 or more than 1000 employees are more likely to agree with this statement. In addition, ambiguous answers are also common among respondents from companies of all sizes.
# Q3, male respondents: number of answers per company size and answer, plus
# each answer's percentage within its company size.
mental7 <- dplyr::filter(mental, gender == "Male")
mental7 <- dplyr::group_by(mental7, company_size, gender, coworker_views)
mental7 <- dplyr::summarise(mental7, frequency = dplyr::n())
mental7 <- dplyr::mutate(mental7, frequency_ratio = frequency / sum(frequency) * 100)

# Q3, female respondents: same table, with the counts multiplied by 2.32
# (times male/female ratio).
mental8 <- dplyr::filter(mental, gender == "Female")
mental8 <- dplyr::group_by(mental8, company_size, gender, coworker_views)
mental8 <- dplyr::summarise(mental8, frequency = 2.32 * dplyr::n())
mental8 <- dplyr::mutate(mental8, frequency_ratio = frequency / sum(frequency) * 100)

# Male rows first, then female rows.
mental9 <- rbind(mental7, mental8)

# Balloon plot: one panel per gender, balloon size and colour both show `frequency`.
fig3 <- ggballoonplot(
  mental9,
  x = "coworker_views",
  y = "company_size",
  size = "frequency",
  fill = "frequency",
  facet.by = "gender",
  ggtheme = theme_bw()
)
fig3 <- fig3 + scale_fill_viridis_c(option = "D", direction = -1)
fig3 <- ggpar(
  fig3,
  main = "Company Size & Whether Coworkers Will View You More Negatively",
  xlab = "Answer",
  ylab = "Company size"
)
fig3

Male and female respondents answer this question in similar proportions. Respondents from companies with 26-100 or more than 1000 employees are more likely to agree that coworkers will view them more negatively.


Reactions by Job Type

The six plots below show the answers to the question “whether mental health issues would hurt your careers” for the job types that are most common in the survey: back-end developers, front-end developers, supervisors/team leads, DevOps/SysAdmins, dev evangelists/advocates and support staff.
library(stringr) 
library(waffle) 
library(viridis) 
# Give the job-description column a short name (renamed in place).
data.table::setnames(
  mental,
  old = "Which.of.the.following.best.describes.your.work.position.",
  new = "work_position_general"
)

# Respondents can list several positions separated by "|"; keep the first one
# and append it to `mental` as the column `work_position`.
work_position <- data.frame(
  work_position = stringr::word(mental$work_position_general, 1, sep = "\\|")
)
mental <- cbind(mental, work_position)

# Most frequent first-listed positions, ignoring "Other".
plyr::count(mental, 'work_position') %>%
  arrange(desc(freq)) %>%
  filter(work_position != "Other") %>%
  top_n(6)
#arrange by job types
# Count the answers in `mental$mental_hurt_career` for respondents whose
# `work_position` equals `job_type`.
career_answer_counts <- function(job_type) {
  # which() drops rows where the comparison is NA, as dplyr::filter() would
  table(mental$mental_hurt_career[which(mental$work_position == job_type)])
}

# Draw one waffle chart from a table of answer counts.
#   counts   - table of answer counts
#   title    - plot title
#   rows     - number of rows in the waffle grid
#   n_colors - number of viridis colours handed to waffle()
career_waffle <- function(counts, title, rows = 10, n_colors = 5) {
  waffle(counts, rows = rows,
         #xlab="Whether mental health issue would hurt your career",
         colors = viridis::viridis(n_colors), reverse = TRUE) +
    labs(title = title) +
    theme(plot.title = element_text(color = "black", size = 10))
}

back <- career_answer_counts("Back-end Developer")
back_w <- career_waffle(back, "Back-end Developer", rows = 12)

front <- career_answer_counts("Front-end Developer")
front_w <- career_waffle(front, "Front-end Developer")

lead <- career_answer_counts("Supervisor/Team Lead")
lead_w <- career_waffle(lead, "Supervisor/Team Lead")

devops <- career_answer_counts("DevOps/SysAdmin")
devops_w <- career_waffle(devops, "DevOps/SysAdmin", n_colors = 4)

advocate <- career_answer_counts("Dev Evangelist/Advocate")
advocate_w <- career_waffle(advocate, "Dev Evangelist/Advocate")

support <- career_answer_counts("Support")
support_w <- career_waffle(support, "Support", n_colors = 4)


# Six charts in a three-row grid, filled column by column.
back_w + lead_w + front_w + devops_w + advocate_w + support_w +
  plot_layout(nrow = 3, byrow = FALSE) +
  plot_annotation('Survey: Whether Mental Health Issue Would Hurt Your Career')

We can see most of the respondents from different job types agree with the statement that mental health issues would hurt their careers, despite the fact that around one-third of them answer “maybe”.


Text Analysis

Overview

In this section, we further explore people’s attitudes towards mental health in the workplace. We ran a text analysis on participants’ answers to the question “Would you bring up a mental health issue with a potential employer in an interview? Why or why not?”
#import data
mental_health <- read_csv("mental-heath-in-tech-2016_20161114.csv")
# Preview column 38 before it is renamed.
head(mental_health[38])
# Short names for columns 37 and 38.
names(mental_health)[37:38] <- c("menissue_interview","text")
# Keep only the rows that have a value in `text`.
mental_health <- mental_health[!is.na(mental_health$text), ]
# Frequency table of the `menissue_interview` answers, with readable headers.
menissue_interview <- setNames(
  as.data.frame(table(mental_health$menissue_interview)),
  c("Would you bring up a mental health issue with a potential employer in an interview?", "Frequency")
)
menissue_interview


Wordcloud for Each Answer

#answer = maybe
#create corpus
mental_health_maybe <- filter(mental_health, menissue_interview == "Maybe")
# seq_len() also copes with zero matching rows (1:0 would count backwards)
mental_health_maybe$doc_id <- as.character(seq_len(nrow(mental_health_maybe)))
# DataframeSource() needs the columns doc_id and text, in that order; picking
# them by name does not depend on how many columns the data has.
mental_health_maybe <- mental_health_maybe[, c("doc_id", "text")]
maybe_corpus <- DataframeSource(mental_health_maybe) %>% VCorpus()
#clean corpus
# Normalise a tm corpus in five steps, applied in this order: strip
# punctuation, lower-case, remove the custom and English stop words, remove
# numbers, collapse repeated whitespace. Returns the transformed corpus.
# NOTE(review): punctuation is stripped before the stop words are removed, so
# stop words containing an apostrophe may no longer match - confirm this is
# intended.
clean_corpus <- function(corpus){
  words_to_drop <- c("mental", "health", "interview", "feel", "bring", "want", "made", "get", "employer", "hire", "need", "know","sure", "may", "affect", "job", stopwords("en"))
  corpus %>%
    tm_map(removePunctuation) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, words_to_drop) %>%
    tm_map(removeNumbers) %>%
    tm_map(stripWhitespace)
}
maybe_clean <- clean_corpus(maybe_corpus)
#stem and stem completion
# Complete the stems of one document against `dictionary` and return the
# document as a single string with normalised whitespace.
#   x          - one document; converted with as.character() and split on spaces
#   dictionary - passed to tm's stemCompletion() as the completion dictionary
stemCompletion2 <- function(x, dictionary) {
  stems <- unlist(strsplit(as.character(x), " "))
  stems <- stems[stems != ""]
  completed <- stemCompletion(stems, dictionary = dictionary)
  stripWhitespace(paste(completed, collapse = " "))
}
maybe_stemmed <- tm_map(maybe_clean, stemDocument)
# Complete each stemmed document, then wrap the results in a new corpus.
maybe_compl <- Corpus(VectorSource(
  lapply(maybe_stemmed, stemCompletion2, dictionary = maybe_clean)
))
#word cloud
maybe_tdm <- TermDocumentMatrix(maybe_compl)
# One row per term/document pair with tf, idf and tf_idf added, highest tf_idf first.
maybe_tf_idf <- arrange(
  bind_tf_idf(tidy(maybe_tdm), term, document, count),
  desc(tf_idf)
)
set.seed(1000)
# Columns 1 and 4 of the table are `term` and `tf`.
# NOTE(review): the cloud is sized by tf, not tf_idf - confirm this is intended.
wordcloud2(maybe_tf_idf[, c("term", "tf")], color = "random-dark", shape = "diamond")
The word cloud above shows the most common words used by participants answering maybe. They use uncertain words like I don’t know and unsure a lot, which is consistent with their answer.


#answer = yes
#create corpus and clean
mental_health_yes <- filter(mental_health, menissue_interview == "Yes")
# seq_len() also copes with zero matching rows (1:0 would count backwards)
mental_health_yes$doc_id <- as.character(seq_len(nrow(mental_health_yes)))
# DataframeSource() needs the columns doc_id and text, in that order; picking
# them by name does not depend on how many columns the data has.
mental_health_yes <- mental_health_yes[, c("doc_id", "text")]
yes_corpus <- DataframeSource(mental_health_yes) %>% VCorpus()
yes_clean <- clean_corpus(yes_corpus)
#stem and stem completion
yes_stemmed <- tm_map(yes_clean, stemDocument)
yes_compl <- lapply(yes_stemmed, stemCompletion2, dictionary = yes_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
yes_tdm <- TermDocumentMatrix(yes_compl)
# One row per term/document pair with tf, idf and tf_idf added, highest tf_idf first.
yes_tf_idf <- tidy(yes_tdm) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# Columns 1 and 4 of the table are `term` and `tf`.
# NOTE(review): the cloud is sized by tf, not tf_idf - confirm this is intended.
wordcloud2(yes_tf_idf[, c(1, 4)], color = "random-dark", shape = "diamond")
The word cloud above shows the most common words used by participants answering yes. Words like affects, effect and relevant show that they think the issue is important. Although positive words like advocate are used, they also use words like unfortunate to show their worries.


#answer = no
#create corpus and clean
mental_health_no <- filter(mental_health, menissue_interview == "No")
# seq_len() also copes with zero matching rows (1:0 would count backwards)
mental_health_no$doc_id <- as.character(seq_len(nrow(mental_health_no)))
# DataframeSource() needs the columns doc_id and text, in that order; picking
# them by name does not depend on how many columns the data has.
mental_health_no <- mental_health_no[, c("doc_id", "text")]
no_corpus <- DataframeSource(mental_health_no) %>% VCorpus()
no_clean <- clean_corpus(no_corpus)
#stem and stem completion
no_stemmed <- tm_map(no_clean, stemDocument)
no_compl <- lapply(no_stemmed, stemCompletion2, dictionary = no_clean) %>%
  VectorSource() %>%
  Corpus()
#word cloud
no_tdm <- TermDocumentMatrix(no_compl)
# One row per term/document pair with tf, idf and tf_idf added, highest tf_idf first.
no_tf_idf <- tidy(no_tdm) %>%
  bind_tf_idf(term, document, count) %>%
  arrange(desc(tf_idf))
set.seed(1000)
# Columns 1 and 4 of the table are `term` and `tf`.
# NOTE(review): the cloud is sized by tf, not tf_idf - confirm this is intended.
wordcloud2(no_tf_idf[, c(1, 4)], color = "random-dark", shape = "diamond")
The word cloud above shows the most common words used by participants answering no. They tend to use negative words in answer, such as lose, taboo, and cost.


Word Frequency for Each Answer

# Horizontal bar chart of the terms with the ten highest total counts in a
# tidy term table (columns `term` and `count`); slice_max() keeps ties.
#   term_table  - tidy term/document table
#   chart_title - plot title
#   bar_fill    - bar colour
top_terms_chart <- function(term_table, chart_title, bar_fill) {
  term_table %>%
    dplyr::group_by(term) %>%
    dplyr::summarize("frequency" = sum(count)) %>%
    slice_max(frequency, n = 10) %>%
    ggplot(aes(reorder(term, frequency), frequency)) +
    geom_bar(stat = "identity", fill = bar_fill) +
    coord_flip() +
    ggtitle(chart_title) +
    theme(
      axis.title.y = element_blank(),
      axis.title.x = element_blank(),
      panel.background = element_blank(),
      plot.title = element_text(face = "bold", color = "black", size = 10)
    )
}

#top 10 words in maybe
b1 <- top_terms_chart(maybe_tf_idf, "Answer = Maybe", "#AC5782")
#top 10 words in yes
b2 <- top_terms_chart(yes_tf_idf, "Answer = Yes", "#E41A1C")
#top 10 words in no
b3 <- top_terms_chart(no_tf_idf, "Answer = No", "#449B75")

# Panels in the order maybe, no, yes.
b1 + b3 + b2 + plot_annotation('Top 10 Most Frequent Words for Each Answer')

Participants who answer maybe tend to use uncertain words like depends and discuss, but they also use some negative words such as stigma and wouldn’t. Those who answer yes tend to use positive words like important and good while those answering no use negative words such as stigma, don’t, wouldn’t and negatively.


Sentiment Analysis for Each Answer

#import Hu & Liu Dictionary
pos <- read.table("positive-words.txt", as.is = TRUE)
neg <- read.table("negative-words.txt", as.is = TRUE)
#define sentiment function
#' Sentiment score of one text
#'
#' Tokenises `words` and counts how many tokens of the first document appear
#' in the first column of the global tables `pos` and `neg`.
#'
#' @param words A character string.
#' @return (positive - negative) / (positive + negative), a value in [-1, 1];
#'   NaN when the text contains no word from either list (0 / 0).
sentiment <- function(words){
  tok <- quanteda::tokens(words)
  # NOTE(review): tokens are matched as written; if the word lists are all
  # lower-case, capitalised words are not counted - confirm whether the text
  # should be lower-cased first.
  pos.count <- sum(tok[[1]] %in% pos[, 1])
  neg.count <- sum(tok[[1]] %in% neg[, 1])
  (pos.count - neg.count) / (pos.count + neg.count)
}
#calculate the sentiment
# vapply() always returns a plain numeric vector, including for zero rows
mental_health$sentiment <- vapply(mental_health$text, sentiment, numeric(1),
                                  USE.NAMES = FALSE)

#plot the relationship between sentiment and answer
# Minimal theme: no legend or x-axis title, horizontal grid lines only.
sentiment_theme <- theme(
  legend.position = 'none',
  axis.title.x = element_blank(),
  axis.title.y = element_text(vjust = 2),
  panel.background = element_blank(),
  panel.grid.major = element_line(color = "gray50", size = 0.5),
  panel.grid.major.x = element_blank(),
  plot.title = element_text(face = "bold", color = "black", size = 12)
)

# One box per answer; the white diamond marks the group mean.
p1 <- ggplot(mental_health, aes(x = menissue_interview, y = sentiment)) +
  geom_boxplot(aes(fill = menissue_interview)) +
  stat_summary(
    mapping = aes(group = menissue_interview),
    fun = "mean",
    geom = "point",
    shape = 23,
    size = 3,
    fill = "white"
  ) +
  scale_fill_manual(values = c("#AC5782", "#449B75", "#E41A1C")) +
  labs(title = "Distribution of Sentiment Score by Answer", y = "Sentiment Score") +
  sentiment_theme

# Interactive version of the same plot.
pp1 <- ggplotly(p1, tooltip = "sentiment")
pp1
Respondents who answer maybe show a negative sentiment towards the question with mean score -0.41 and median score -1. Compared with them, those answering no show a more negative sentiment with mean score -0.43 and median score -1. However, those with answer yes present a totally different sentiment. They get an average sentiment score of 0.21 and median score of 0.17.